library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)
DB <- "EnsDb" # Set your DB of interest
AnnotationSpecies <- "Homo sapiens" # Set your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Bring annotation DB
# Filter annotation of interest
ahQuery <- query(ah,
pattern=c(DB, AnnotationSpecies),
ignore.case=T)
# Select the most recent data
DBName <- mcols(ahQuery) %>%
rownames() %>%
tail(1)
AnnoDb <- ah[[DBName]]
# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="TXID",
columns=c("GENEID", "GENENAME"))
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb) # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
## TXID GENEID GENENAME
## 1 ENST00000000233 ENSG00000004059 ARF5
## 2 ENST00000000412 ENSG00000003056 M6PR
## 3 ENST00000000442 ENSG00000173153 ESRRA
## 4 ENST00000001008 ENSG00000004478 FKBP4
## 5 ENST00000001146 ENSG00000003137 CYP26B1
## 6 ENST00000002125 ENSG00000003509 NDUFAF7
.sf files have been created from fastq data by salmon
# This code chunk needs to be written by yourself
#
# Define sample names
SampleNames <- c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3))
# Define group level
GroupLevel <- c("Mock", "Covid")
# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)
# Define a vector for comparing TPM vs Counts effect
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC
# Define .sf file path
sf <- c(paste0("salmon_output/",
SampleNames,
".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group
## Mock-rep1 Mock-rep1 Mock
## Mock-rep2 Mock-rep2 Mock
## Mock-rep3 Mock-rep3 Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
## Path
## Mock-rep1 salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2 salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3 salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf
# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
countsFromAbundance="lengthScaledTPM", # Extracts TPM
ignoreTxVersion=T)
txi_counts <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T)
# tpm
head(txi_tpm$counts)
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1
## ENSG00000000003 9517.232304 7375.7279909 7413.6544 10930.25206
## ENSG00000000005 7.046584 3.9312090 0.0000 2.00487
## ENSG00000000419 878.259656 916.2032201 728.9493 1201.89725
## ENSG00000000457 711.680520 553.4857448 561.2216 777.32713
## ENSG00000000460 465.052498 352.6689833 391.7890 731.41427
## ENSG00000000938 0.000000 0.9944634 0.0000 0.00000
## SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000000003 6030.5160049 7834.953120
## ENSG00000000005 0.9864265 4.002106
## ENSG00000000419 783.6348859 991.725547
## ENSG00000000457 459.4814539 603.460367
## ENSG00000000460 377.0127208 536.898627
## ENSG00000000938 0.0000000 0.000000
dim(txi_tpm$counts)
## [1] 60240 6
# counts
head(txi_counts$counts)
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003 8991.942 7660.942 7504.000 11037.869 6193.989
## ENSG00000000005 7.000 4.000 0.000 2.000 1.000
## ENSG00000000419 889.000 928.001 730.000 1211.001 777.000
## ENSG00000000457 699.297 564.374 566.745 832.783 437.987
## ENSG00000000460 452.703 366.157 397.262 740.218 388.013
## ENSG00000000938 0.000 1.000 0.000 0.000 0.000
## SARS-CoV-2-rep3
## ENSG00000000003 7772.929
## ENSG00000000005 4.000
## ENSG00000000419 991.000
## ENSG00000000457 596.733
## ENSG00000000460 514.268
## ENSG00000000938 0.000
dim(txi_counts$counts)
## [1] 60240 6
# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) {
# Create a DESeq object (so-calledd dds)
des <- DESeqDataSetFromTximport(txi,
colData=metadata,
design=~Group)
# Create a vsd object (so-called vsd)
ves <- vst(des, blind=T)
# Output them as a list
return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']]
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]
# TPM
TPM[[1]]
## class: DESeqDataSet
## dim: 60240 6
## metadata(1): version
## assays(1): counts
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
## ENSG00000288675
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003 9517 7376 7414 10930 6031
## ENSG00000000005 7 4 0 2 1
## ENSG00000000419 878 916 729 1202 784
## ENSG00000000457 712 553 561 777 459
## ENSG00000000460 465 353 392 731 377
## ENSG00000000938 0 1 0 0 0
## SARS-CoV-2-rep3
## ENSG00000000003 7835
## ENSG00000000005 4
## ENSG00000000419 992
## ENSG00000000457 603
## ENSG00000000460 537
## ENSG00000000938 0
# Counts
Counts[[1]]
## class: DESeqDataSet
## dim: 60240 6
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
## ENSG00000288675
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003 8992 7661 7504 11038 6194
## ENSG00000000005 7 4 0 2 1
## ENSG00000000419 889 928 730 1211 777
## ENSG00000000457 699 564 567 833 438
## ENSG00000000460 453 366 397 740 388
## ENSG00000000938 0 1 0 0 0
## SARS-CoV-2-rep3
## ENSG00000000003 7773
## ENSG00000000005 4
## ENSG00000000419 991
## ENSG00000000457 597
## ENSG00000000460 514
## ENSG00000000938 0
# TPM
TPM[[2]]
## class: DESeqTransform
## dim: 60240 6
## metadata(1): version
## assays(1): ''
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
## ENSG00000288675
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform
## dim: 60240 6
## metadata(1): version
## assays(1): ''
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
## ENSG00000288675
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
List[[1]] <- estimateSizeFactors(List[[1]])
List[[1]] <- estimateDispersions(List[[1]])
List[[1]] <- nbinomWaldTest(List[[1]])
return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts)
TPM <- DESeqPrep_fn(TPM)
sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1.0962587 1.0446876 0.8945224 1.3649447 0.7478616
## SARS-CoV-2-rep3
## 1.0185737
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead!
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
## Sample Group Path
## <factor> <factor> <character>
## Mock-rep1 Mock-rep1 Mock salmon_output/Mock-r..
## Mock-rep2 Mock-rep2 Mock salmon_output/Mock-r..
## Mock-rep3 Mock-rep3 Mock salmon_output/Mock-r..
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid salmon_output/SARS-C..
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid salmon_output/SARS-C..
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid salmon_output/SARS-C..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
## Sample Group Path sizeFactor
## <factor> <factor> <character> <numeric>
## Mock-rep1 Mock-rep1 Mock salmon_output/Mock-r.. 1.096259
## Mock-rep2 Mock-rep2 Mock salmon_output/Mock-r.. 1.044688
## Mock-rep3 Mock-rep3 Mock salmon_output/Mock-r.. 0.894522
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid salmon_output/SARS-C.. 1.364945
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid salmon_output/SARS-C.. 0.747862
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid salmon_output/SARS-C.. 1.018574
# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
rownames_to_column(var="Sample") %>%
inner_join(metadata[, 1:ncol(metadata)-1], by="Sample")
sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample,
y=Size_Factor,
fill=Group,
label=Size_Factor)) +
geom_bar(stat="identity", width=0.8) +
theme_bw() +
ggtitle("Size Factors in TPM-DESeq") +
geom_text(vjust=1.5) +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")
# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
gather(Sample, Normalization_Factor) %>%
inner_join(metadata[, 1:2], by="Sample")
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors
normFactors_plot <- ggplot(normf,
aes(x=Sample, y=Normalization_Factor)) +
geom_jitter(alpha=0.5, aes(color=Group)) +
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor),
outlier.shape=NA, alpha=0.5) +
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) +
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1,
color="blue",
linetype="dashed")
# Print the normalization factor scatter plot
print(normFactors_plot)
# Print the same plot with larger y-magnification in order to observe the box plot
normFactors_plot +
ylim(0.5, 1.5)
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
plotPCA(inputList[[2]], # takes vsd
intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap
QCcorrHeatmap_fn <- function(inputList, Title) {
# Extract transformed count matrix
mtx <- assay(inputList[[2]]) # takes vsd
# Calculate correlation and store in the matrix
mtx <- cor(mtx)
# Create a correlation heatmap
return(pheatmap(mtx,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:",
Title)))
}
grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"),
QCPCA_fn(Counts, "QC PCA: Counts"),
nrow=2)
# TPM
QCcorrHeatmap_fn(TPM, "TPM")
# Counts
QCcorrHeatmap_fn(Counts, "Counts")
# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]])
names(desList) <- TvC
# Create a list for TPM and Counts dds
ddsList <- desList # DE without shrinkage
normal.ddsList <- desList # DE with "normal" shrinkage
ape.ddsList <- desList # DE with "apeglm" shrinkage
ash.ddsList <- desList # DE with "ashr" shrinkage
for (x in TvC) {
# Run DESeq()
ddsList[[x]] <- DESeq(desList[[x]])
print(resultsNames(ddsList[[x]]))
normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
contrast=Contrast,
type="normal")
ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="apeglm")
ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="ashr")
}
## [1] "Intercept" "Group_Covid_vs_Mock"
## [1] "Intercept" "Group_Covid_vs_Mock"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage
# Plot dispersion
for (x in TvC) {
plotDispEsts(ddsList[[x]],
ylab="Dispersion",
xlab="Mean of Normalized Counts",
main=paste("Dispersion of", x, "Input"))
}
# Set FDR threshold
alpha=0.1
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1")
# Initialize lists of result tables
all.resList <- all.ddsList
# Set a function cleaning table
Sig.fn <- function(df, Input) {
df <- df %>%
rownames_to_column(var="Gene") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
FDRv[1],
FDRv[2]),
Input=Input)
return(df)
}
for (i in shrinkage) {
if (i == "None") {
for (x in TvC) {
# Extract data frames from unshrunken lfc & clean data
all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]],
contrast=Contrast,
alpha=alpha)) %>%
Sig.fn(x)
}
} else {
# Extract data frames from shrunken lfc & clean data
for (x in TvC) {
all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
Sig.fn(x)
}
}
}
# Explore the results
summary(all.resList)
## Length Class Mode
## None 2 -none- list
## Normal 2 -none- list
## Apeglm 2 -none- list
## Ashr 2 -none- list
head(all.resList[[1]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 7965.6928435 0.01606545 0.1265022 0.1269974 0.89894249
## 2 ENSG00000000005 2.8239528 0.61310858 1.5972550 0.3838514 0.70108859
## 3 ENSG00000000419 899.2563435 -0.21797908 0.1439985 -1.5137592 0.13008694
## 4 ENSG00000000457 596.8308344 0.02584059 0.1578318 0.1637223 0.86994974
## 5 ENSG00000000460 461.1928898 -0.38598899 0.1724779 -2.2379038 0.02522733
## 6 ENSG00000000938 0.1595373 0.96892188 4.0804729 0.2374533 0.81230512
## padj FDR Input
## 1 0.9648660 > 0.1 TPM
## 2 NA > 0.1 TPM
## 3 0.3742990 > 0.1 TPM
## 4 0.9545403 > 0.1 TPM
## 5 0.1284758 > 0.1 TPM
## 6 NA > 0.1 TPM
head(all.resList[[1]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 8079.0801619 0.01540008 0.1269664 0.1212925 0.90345933
## 2 ENSG00000000005 2.8594904 0.61234899 1.5811720 0.3872754 0.69855235
## 3 ENSG00000000419 912.4765014 -0.21785565 0.1443733 -1.5089747 0.13130525
## 4 ENSG00000000457 605.6119743 0.02391363 0.1576831 0.1516563 0.87945799
## 5 ENSG00000000460 467.6415182 -0.38743010 0.1721107 -2.2510517 0.02438226
## 6 ENSG00000000938 0.1610882 0.96751347 4.0804729 0.2371082 0.81257287
## padj FDR Input
## 1 0.9671957 > 0.1 Counts
## 2 NA > 0.1 Counts
## 3 0.3771258 > 0.1 Counts
## 4 0.9576948 > 0.1 Counts
## 5 0.1253609 > 0.1 Counts
## 6 NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 7965.6928435 0.01566841 0.1233752 0.1269974 0.89894249
## 2 ENSG00000000005 2.8239528 0.12175895 0.3175826 0.3838514 0.70108859
## 3 ENSG00000000419 899.2563435 -0.21104412 0.1394188 -1.5137592 0.13008694
## 4 ENSG00000000457 596.8308344 0.02485951 0.1518409 0.1637223 0.86994974
## 5 ENSG00000000460 461.1928898 -0.36862111 0.1647104 -2.2379038 0.02522733
## 6 ENSG00000000938 0.1595373 0.03539765 0.1490746 0.2374533 0.81230512
## padj FDR Input
## 1 0.9648660 > 0.1 TPM
## 2 NA > 0.1 TPM
## 3 0.3742990 > 0.1 TPM
## 4 0.9545403 > 0.1 TPM
## 5 0.1284758 > 0.1 TPM
## 6 NA > 0.1 TPM
head(all.resList[[2]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 8079.0801619 0.01501330 0.1237769 0.1212925 0.90345933
## 2 ENSG00000000005 2.8594904 0.12268803 0.3171860 0.3872754 0.69855235
## 3 ENSG00000000419 912.4765014 -0.21082731 0.1397174 -1.5089747 0.13130525
## 4 ENSG00000000457 605.6119743 0.02299923 0.1516556 0.1516563 0.87945799
## 5 ENSG00000000460 467.6415182 -0.36991324 0.1643234 -2.2510517 0.02438226
## 6 ENSG00000000938 0.1610882 0.03503450 0.1477600 0.2371082 0.81257287
## padj FDR Input
## 1 0.9671957 > 0.1 Counts
## 2 NA > 0.1 Counts
## 3 0.3771258 > 0.1 Counts
## 4 0.9576948 > 0.1 Counts
## 5 0.1253609 > 0.1 Counts
## 6 NA > 0.1 Counts
# Set ylim: has to adjusted by users depending on data
yl <- c(-25, 25)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Initialize a list storing MA plots
MAList <- ddsList
# Create MA plots
for (i in shrinkage) {
both.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
MAList[[i]] <- ggplot(both.df,
aes(x=baseMean, y=log2FoldChange, color=FDR)) +geom_point() + scale_x_log10() + facet_grid(~Input) +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot with", i)) +
ylim(yl[1], yl[2]) +
geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
}
# Print MA plots
MAList
## $TPM
## class: DESeqDataSet
## dim: 60240 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
## ENSG00000288675
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
##
## $Counts
## class: DESeqDataSet
## dim: 60240 6
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(60240): ENSG00000000003 ENSG00000000005 ... ENSG00000288674
## ENSG00000288675
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
##
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']],
all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)
# Plot distribution of FDR
ggplot(both.df,
aes(x=padj, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") +
xlab("Adjusted P-Value (FDR)") +
ylab("Count") +
geom_vline(xintercept=alpha,
color="black",
linetype="dashed",
size=1) +
scale_x_continuous(breaks=seq(0, 1, by=0.1))
# Initialize a lfc list
lfcplotList <- all.resList
# Create lfc distribution plots
for (i in shrinkage) {
lfc.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]
lfc.df$Input <- factor(lfc.df$Input, levels=TvC)
lfcplotList[[i]] <- ggplot(lfc.df, # Subset rows with FDR < alpha
aes(x=log2FoldChange, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() + ylab("Count") +
geom_vline(xintercept=c(mLog[1], mLog[2]),
color="black",
linetype="dashed",
size=1) +
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) +
xlim(-8, 8)
}
# Print the lfc plots
lfcplotList
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing NA gene number
NAstat <- both.df %>%
group_by(Input) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Input) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NAstat,
aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList
# Set a function cleaning a data frame
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}
# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}
for (i in shrinkage) {
for (x in TvC) {
rankdf <- all.resList[[i]][[x]]
fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn()
lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()
up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(desc(log2FoldChange)) %>%
Ranking.fn()
down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(log2FoldChange) %>%
Ranking.fn()
}
}
# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000169908 2254.333 -4.059309 0.1960933 -20.70091 3.398917e-95
## 2: ENSG00000261008 1643.843 -4.028891 0.2035790 -19.79031 3.608319e-87
## 3: ENSG00000118523 3796.333 -3.494708 0.2090989 -16.71318 1.050769e-62
## 4: ENSG00000136943 11206.252 -3.090493 0.1969195 -15.69420 1.657303e-55
## 5: ENSG00000181104 2467.646 -1.962023 0.1308756 -14.99150 8.344167e-51
## 6: ENSG00000038427 6589.433 -2.228463 0.1488375 -14.97246 1.111298e-50
## padj FDR Input Rank
## 1: 6.328443e-91 < 0.1 TPM 1
## 2: 3.359165e-83 < 0.1 TPM 2
## 3: 6.521422e-59 < 0.1 TPM 3
## 4: 7.714333e-52 < 0.1 TPM 4
## 5: 3.107201e-47 < 0.1 TPM 5
## 6: 3.448543e-47 < 0.1 TPM 6
head(fdr.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000169908 2288.067 -4.059312 0.1950583 -20.81076 3.458154e-96
## 2: ENSG00000261008 1635.608 -4.023630 0.2035590 -19.76641 5.795623e-87
## 3: ENSG00000118523 3853.040 -3.494245 0.2080393 -16.79608 2.606997e-63
## 4: ENSG00000136943 11365.707 -3.090702 0.1961564 -15.75631 6.215816e-56
## 5: ENSG00000038427 6677.623 -2.228988 0.1487473 -14.98506 9.194038e-51
## 6: ENSG00000181104 2504.476 -1.962507 0.1315082 -14.92307 2.332836e-50
## padj FDR Input Rank
## 1: 6.447037e-92 < 0.1 Counts 1
## 2: 5.402390e-83 < 0.1 Counts 2
## 3: 1.620075e-59 < 0.1 Counts 3
## 4: 2.897037e-52 < 0.1 Counts 4
## 5: 3.428089e-47 < 0.1 Counts 5
## 6: 7.248511e-47 < 0.1 Counts 6
head(lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.377178 -20.124630 3.909696 -5.147364 2.641722e-07
## 2: ENSG00000283809 30.167576 8.377090 1.268057 6.606240 3.942030e-11
## 3: ENSG00000169085 24.416072 -8.054346 1.314483 -6.127388 8.933355e-10
## 4: ENSG00000248167 14.783865 -7.337185 1.355943 -5.411133 6.262727e-08
## 5: ENSG00000263958 8.897781 -6.589416 1.758112 -3.748006 1.782457e-04
## 6: ENSG00000198796 46.727730 -6.571173 1.515131 -4.337033 1.444187e-05
## padj FDR Input Rank
## 1: 9.606684e-06 < 0.1 TPM 1
## 2: 3.163649e-09 < 0.1 TPM 2
## 3: 5.525918e-08 < 0.1 TPM 3
## 4: 2.632183e-06 < 0.1 TPM 4
## 5: 2.655005e-03 < 0.1 TPM 5
## 6: 3.185938e-04 < 0.1 TPM 6
head(lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.65189 -20.764231 3.9096818 -5.310977 1.090390e-07
## 2: ENSG00000283809 30.69086 8.386429 1.2655524 6.626694 3.432871e-11
## 3: ENSG00000169085 22.92788 -7.974919 1.3156842 -6.061423 1.349221e-09
## 4: ENSG00000198796 45.23995 -7.672070 1.3269181 -5.781871 7.387415e-09
## 5: ENSG00000248167 14.92215 -7.336534 1.3505819 -5.432128 5.568592e-08
## 6: ENSG00000122641 129.68367 -5.939531 0.5738227 -10.350813 4.149479e-25
## padj FDR Input Rank
## 1: 4.306810e-06 < 0.1 Counts 1
## 2: 2.746739e-09 < 0.1 Counts 2
## 3: 8.114042e-08 < 0.1 Counts 3
## 4: 3.794038e-07 < 0.1 Counts 4
## 5: 2.354087e-06 < 0.1 Counts 5
## 6: 2.035756e-22 < 0.1 Counts 6
head(up.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000283809 30.167576 8.377090 1.2680572 6.606240 3.942030e-11
## 2: ENSG00000271723 19.774915 5.193793 1.0422077 4.983453 6.245960e-07
## 3: ENSG00000227373 41.205040 4.620244 1.3222441 3.494245 4.754050e-04
## 4: ENSG00000265168 17.724166 4.561790 0.9740951 4.683105 2.825613e-06
## 5: ENSG00000283698 9.252657 3.924808 1.6104430 2.437098 1.480565e-02
## 6: ENSG00000279750 9.870121 3.817917 1.1405856 3.347331 8.159385e-04
## padj FDR Input Rank
## 1: 3.163649e-09 < 0.1 TPM 1
## 2: 2.072968e-05 < 0.1 TPM 2
## 3: 5.976749e-03 < 0.1 TPM 3
## 4: 7.691535e-05 < 0.1 TPM 4
## 5: 8.725174e-02 < 0.1 TPM 5
## 6: 9.218422e-03 < 0.1 TPM 6
head(up.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000283809 30.690860 8.386429 1.2655524 6.626694 3.432871e-11
## 2: ENSG00000227373 26.791564 4.849108 0.8326809 5.823489 5.763157e-09
## 3: ENSG00000271723 20.185952 4.762474 0.9708544 4.905446 9.321512e-07
## 4: ENSG00000265168 17.958506 4.569418 0.9644351 4.737922 2.159209e-06
## 5: ENSG00000279750 9.980378 3.812218 1.1296849 3.374585 7.392702e-04
## 6: ENSG00000251177 9.063109 3.763735 1.0759497 3.498059 4.686582e-04
## padj FDR Input Rank
## 1: 2.746739e-09 < 0.1 Counts 1
## 2: 3.043698e-07 < 0.1 Counts 2
## 3: 2.867770e-05 < 0.1 Counts 3
## 4: 6.008079e-05 < 0.1 Counts 4
## 5: 8.540299e-03 < 0.1 Counts 5
## 6: 5.895543e-03 < 0.1 Counts 6
head(down.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.377178 -20.124630 3.9096963 -5.147364 2.641722e-07
## 2: ENSG00000169085 24.416072 -8.054346 1.3144829 -6.127388 8.933355e-10
## 3: ENSG00000248167 14.783865 -7.337185 1.3559425 -5.411133 6.262727e-08
## 4: ENSG00000263958 8.897781 -6.589416 1.7581122 -3.748006 1.782457e-04
## 5: ENSG00000198796 46.727730 -6.571173 1.5151307 -4.337033 1.444187e-05
## 6: ENSG00000205277 31.406357 -5.965202 0.9969242 -5.983606 2.182508e-09
## padj FDR Input Rank
## 1: 9.606684e-06 < 0.1 TPM 1
## 2: 5.525918e-08 < 0.1 TPM 2
## 3: 2.632183e-06 < 0.1 TPM 3
## 4: 2.655005e-03 < 0.1 TPM 4
## 5: 3.185938e-04 < 0.1 TPM 5
## 6: 1.269879e-07 < 0.1 TPM 6
head(down.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.65189 -20.764231 3.9096818 -5.310977 1.090390e-07
## 2: ENSG00000169085 22.92788 -7.974919 1.3156842 -6.061423 1.349221e-09
## 3: ENSG00000198796 45.23995 -7.672070 1.3269181 -5.781871 7.387415e-09
## 4: ENSG00000248167 14.92215 -7.336534 1.3505819 -5.432128 5.568592e-08
## 5: ENSG00000122641 129.68367 -5.939531 0.5738227 -10.350813 4.149479e-25
## 6: ENSG00000205277 19.72852 -5.935479 0.9101660 -6.521315 6.969362e-11
## padj FDR Input Rank
## 1: 4.306810e-06 < 0.1 Counts 1
## 2: 8.114042e-08 < 0.1 Counts 2
## 3: 3.794038e-07 < 0.1 Counts 3
## 4: 2.354087e-06 < 0.1 Counts 4
## 5: 2.035756e-22 < 0.1 Counts 5
## 6: 5.391278e-09 < 0.1 Counts 6
# Set a function rebuilding DE tables with gene ranks
rankdiff.fn <- function(List){
# Select columns and join the data frames by gene
full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)],
List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)],
by="Gene") %>%
# Add columns assining gene expression levels, rank differences, and mean ranks
mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
RankDiff=Rank.x-Rank.y,
MeanRank=(Rank.x+Rank.y)/2)
}
# Set a function adding Shrinkage column
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)}
# Set a function rbinding multiple data frames
rbinds.fn <- function(List) {rbind(List[[1]],
List[[2]],
List[[3]],
List[[4]])}
# Calculate and plot rank difference
for (i in shrinkage) {
# Calculate rank difference
fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i)
}
# Create combined data frames across the shrinkages
fdr.rank.df <- rbinds.fn(fdr.rank)
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)
# Explore the ranking outputs
head(fdr.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000169908 TPM 1 2254.333 Counts 1 2288.067
## 2: ENSG00000261008 TPM 2 1643.843 Counts 2 1635.608
## 3: ENSG00000118523 TPM 3 3796.333 Counts 3 3853.040
## 4: ENSG00000136943 TPM 4 11206.252 Counts 4 11365.707
## 5: ENSG00000181104 TPM 5 2467.646 Counts 6 2504.476
## 6: ENSG00000038427 TPM 6 6589.433 Counts 5 6677.623
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 8.131050 0 1.0 None
## 2: 7.808586 0 2.0 None
## 3: 8.652223 0 3.0 None
## 4: 9.734424 0 4.0 None
## 5: 8.221448 -1 5.5 None
## 6: 9.203139 1 5.5 None
head(lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.377178 Counts 1 15.65189
## 2: ENSG00000283809 TPM 2 30.167576 Counts 2 30.69086
## 3: ENSG00000169085 TPM 3 24.416072 Counts 3 22.92788
## 4: ENSG00000248167 TPM 4 14.783865 Counts 5 14.92215
## 5: ENSG00000263958 TPM 5 8.897781 <NA> NA NA
## 6: ENSG00000198796 TPM 6 46.727730 Counts 4 45.23995
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 3.144287 0 1.0 None
## 2: 3.817998 0 2.0 None
## 3: 3.580180 0 3.0 None
## 4: 3.102114 -1 4.5 None
## 5: NA NA NA None
## 6: 4.239133 2 5.0 None
head(up.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000283809 TPM 1 30.167576 Counts 1 30.690860
## 2: ENSG00000271723 TPM 2 19.774915 Counts 3 20.185952
## 3: ENSG00000227373 TPM 3 41.205040 Counts 2 26.791564
## 4: ENSG00000265168 TPM 4 17.724166 Counts 4 17.958506
## 5: ENSG00000283698 TPM 5 9.252657 <NA> NA NA
## 6: ENSG00000279750 TPM 6 9.870121 Counts 5 9.980378
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 3.817998 0 1.0 None
## 2: 3.396784 -1 2.5 None
## 3: 4.000049 1 2.5 None
## 4: 3.284792 0 4.0 None
## 5: NA NA NA None
## 6: 2.698694 1 5.5 None
head(down.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.377178 Counts 1 15.65189
## 2: ENSG00000169085 TPM 2 24.416072 Counts 2 22.92788
## 3: ENSG00000248167 TPM 3 14.783865 Counts 4 14.92215
## 4: ENSG00000263958 TPM 4 8.897781 <NA> NA NA
## 5: ENSG00000198796 TPM 5 46.727730 Counts 3 45.23995
## 6: ENSG00000205277 TPM 6 31.406357 Counts 6 19.72852
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 3.144287 0 1.0 None
## 2: 3.580180 0 2.0 None
## 3: 3.102114 -1 3.5 None
## 4: NA NA NA None
## 5: 4.239133 2 4.0 None
## 6: 3.720151 0 6.0 None
# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df,
aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red")
}
# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")
ranking.plot.fn(lfc.rank.df, "Log2FoldChange")
ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")
ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")
# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) +
geom_point(alpha=0.5) +
theme_bw() +
facet_grid(~Shrinkage) +
theme(strip.text.x=element_text(size=10)) +
ylab("Rank Difference (TPM - Count)") +
ggtitle(paste("Rank Difference (TPM - Count):", rankedby)) +
geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red")
}
# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR", 100)
rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange", 120)
rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)", 100)
rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)", 75)
# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}
# Create a list storing rankdiff data frames
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
unshr.rankdiff.fn(lfc.rank.df),
unshr.rankdiff.fn(up.lfc.rank.df),
unshr.rankdiff.fn(down.lfc.rank.df))
# Assine result column as a factor with levels
rankdiff.levels <- c("FDR",
"log2FoldChange",
"log2FoldChange.Increase",
"log2FoldChange.Decrease")
# Name the list
names(rankList) <- rankdiff.levels
# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff),
log2FoldChange=abs(rankList[[2]]$RankDiff),
log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff)
rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)
# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) +
geom_boxplot(alpha=0.5, fill="grey", color="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n(TPM - Count Input)") +
ylab("Absolute Rank Difference (TPM - Count Input)")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(GENEID) %>%
summarize(num.trans=n_distinct(TXID))
# Set a function adding the number of transcripts by gene id
add.ntrans.fn <- function(df) {
left_join(df, AnnoDb.ntrans, by=c("Gene"="GENEID"))}
# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {
rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}
# Explore the edited data frames
summary(rankList)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
head(rankList[[1]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000169908 TPM 1 2254.333 Counts 1 2288.067
## 2: ENSG00000261008 TPM 2 1643.843 Counts 2 1635.608
## 3: ENSG00000118523 TPM 3 3796.333 Counts 3 3853.040
## 4: ENSG00000136943 TPM 4 11206.252 Counts 4 11365.707
## 5: ENSG00000181104 TPM 5 2467.646 Counts 6 2504.476
## 6: ENSG00000038427 TPM 6 6589.433 Counts 5 6677.623
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 8.131050 0 1.0 None 4
## 2: 7.808586 0 2.0 None 18
## 3: 8.652223 0 3.0 None 1
## 4: 9.734424 0 4.0 None 3
## 5: 8.221448 -1 5.5 None 2
## 6: 9.203139 1 5.5 None 12
head(rankList[[2]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.377178 Counts 1 15.65189
## 2: ENSG00000283809 TPM 2 30.167576 Counts 2 30.69086
## 3: ENSG00000169085 TPM 3 24.416072 Counts 3 22.92788
## 4: ENSG00000248167 TPM 4 14.783865 Counts 5 14.92215
## 5: ENSG00000263958 TPM 5 8.897781 <NA> NA NA
## 6: ENSG00000198796 TPM 6 46.727730 Counts 4 45.23995
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 3.144287 0 1.0 None 1
## 2: 3.817998 0 2.0 None 1
## 3: 3.580180 0 3.0 None 10
## 4: 3.102114 -1 4.5 None 1
## 5: NA NA NA None 3
## 6: 4.239133 2 5.0 None 6
head(rankList[[3]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000283809 TPM 1 30.167576 Counts 1 30.690860
## 2: ENSG00000271723 TPM 2 19.774915 Counts 3 20.185952
## 3: ENSG00000227373 TPM 3 41.205040 Counts 2 26.791564
## 4: ENSG00000265168 TPM 4 17.724166 Counts 4 17.958506
## 5: ENSG00000283698 TPM 5 9.252657 <NA> NA NA
## 6: ENSG00000279750 TPM 6 9.870121 Counts 5 9.980378
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 3.817998 0 1.0 None 1
## 2: 3.396784 -1 2.5 None 4
## 3: 4.000049 1 2.5 None 7
## 4: 3.284792 0 4.0 None 1
## 5: NA NA NA None 2
## 6: 2.698694 1 5.5 None 2
head(rankList[[4]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.377178 Counts 1 15.65189
## 2: ENSG00000169085 TPM 2 24.416072 Counts 2 22.92788
## 3: ENSG00000248167 TPM 3 14.783865 Counts 4 14.92215
## 4: ENSG00000263958 TPM 4 8.897781 <NA> NA NA
## 5: ENSG00000198796 TPM 5 46.727730 Counts 3 45.23995
## 6: ENSG00000205277 TPM 6 31.406357 Counts 6 19.72852
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 3.144287 0 1.0 None 1
## 2: 3.580180 0 2.0 None 10
## 3: 3.102114 -1 3.5 None 1
## 4: NA NA NA None 3
## 5: 4.239133 2 4.0 None 6
## 6: 3.720151 0 6.0 None 6
# Set a function plotting rank difference vs number of transcripts
rank.ntrans.plot.fn <- function(df, title) {
ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) +
geom_jitter(alpha=0.5) +
theme_bw() +
ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \nin", title)) +
xlab("Number of Alternative Transcripts") +
ylab("Absolute Rank Difference \n(TPM - Counts)") + scale_color_gradient(low="blue", high="red")
}
# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")
rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")
rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")
rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")
# Initialize a list storing rankdiff genes
large.rankdiff <- rankList
# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)
names(rankdiff.thr) <- rankdiff.levels
for (x in rankdiff.levels) {
# Filter out observations below the rankdiff thresholds
large.rankdiff[[x]] <- subset(rankList[[x]],
abs(RankDiff) > rankdiff.thr[x])
}
# Explore the filtered genes
summary(large.rankdiff)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 1968 12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 1330 12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 1036 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 846 12
# Set a function saving rankdiff.csv files
saving.fn <- function(input.df, data.type) {
filename <- paste0("tpmVScount_", data.type, "_RankDiff.csv")
return(write.csv(input.df, filename))
}
# Save the filtered and arranged data frames as csv files
saving.fn(large.rankdiff[[rankdiff.levels[1]]], "FDR")
saving.fn(large.rankdiff[[rankdiff.levels[2]]], "LFC")
saving.fn(large.rankdiff[[rankdiff.levels[3]]], "LFC_Increase")
saving.fn(large.rankdiff[[rankdiff.levels[4]]], "LFC_Decrease")
# Set a function cleaning data to generate upset plots
upset.input.fn <- function(df) {
# Filter genes with valid padj
df <- subset(df, !is.na(padj)) %>%
mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes?
Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""), # What are downregulated genes?
Unchanged=ifelse(FDR == FDRv[2], Gene, ""), # What are unchanged genes?
TPM_Input=ifelse(Input == "TPM", Gene, ""), # What are the genes from TPM input?
Counts_Input=ifelse(Input == "Counts", Gene, "")) # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=df$Up,
Down=df$Down,
Unchanged=df$Unchanged,
TPM_Input=df$TPM,
Counts_Input=df$Counts)
return(upsetInput)
}
upsetList <- upset.input.fn(both.df)
# Create the upset plot
upset(fromList(upsetList),
sets=names(upsetList), # What group to display
sets.x.label="Number of Genes per Group",
order.by="freq",
point.size=3,
sets.bar.color=c("red", "red","blue", "blue", "blue"),
text.scale = 1.5, number.angles=30)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ashr_2.2-47 apeglm_1.12.0
## [3] ensembldb_2.14.0 AnnotationFilter_1.14.0
## [5] GenomicFeatures_1.42.1 AnnotationDbi_1.52.0
## [7] UpSetR_1.4.0 gridExtra_2.3
## [9] pheatmap_1.0.12 DESeq2_1.30.0
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [13] MatrixGenerics_1.2.0 matrixStats_0.57.0
## [15] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [17] IRanges_2.24.0 S4Vectors_0.28.1
## [19] tximport_1.18.0 forcats_0.5.0
## [21] stringr_1.4.0 dplyr_1.0.2
## [23] purrr_0.3.4 readr_1.4.0
## [25] tidyr_1.1.2 tibble_3.0.4
## [27] ggplot2_3.3.2 tidyverse_1.3.0
## [29] AnnotationHub_2.22.0 BiocFileCache_1.14.0
## [31] dbplyr_2.0.0 BiocGenerics_0.36.0
## [33] rmarkdown_2.5 data.table_1.13.4
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1
## [3] plyr_1.8.6 lazyeval_0.2.2
## [5] splines_4.0.3 BiocParallel_1.24.1
## [7] digest_0.6.27 invgamma_1.1
## [9] htmltools_0.5.0 SQUAREM_2020.5
## [11] fansi_0.4.1 magrittr_2.0.1
## [13] memoise_1.1.0 Biostrings_2.58.0
## [15] annotate_1.68.0 modelr_0.1.8
## [17] askpass_1.1 bdsmatrix_1.3-4
## [19] prettyunits_1.1.1 colorspace_2.0-0
## [21] blob_1.2.1 rvest_0.3.6
## [23] rappdirs_0.3.1 haven_2.3.1
## [25] xfun_0.19 crayon_1.3.4
## [27] RCurl_1.98-1.2 jsonlite_1.7.2
## [29] genefilter_1.72.0 survival_3.2-7
## [31] glue_1.4.2 gtable_0.3.0
## [33] zlibbioc_1.36.0 XVector_0.30.0
## [35] DelayedArray_0.16.0 scales_1.1.1
## [37] mvtnorm_1.1-1 DBI_1.1.0
## [39] Rcpp_1.0.5 xtable_1.8-4
## [41] progress_1.2.2 emdbook_1.3.12
## [43] bit_4.0.4 truncnorm_1.0-8
## [45] httr_1.4.2 RColorBrewer_1.1-2
## [47] ellipsis_0.3.1 farver_2.0.3
## [49] pkgconfig_2.0.3 XML_3.99-0.5
## [51] utf8_1.1.4 locfit_1.5-9.4
## [53] labeling_0.4.2 tidyselect_1.1.0
## [55] rlang_0.4.9 later_1.1.0.1
## [57] munsell_0.5.0 BiocVersion_3.12.0
## [59] cellranger_1.1.0 tools_4.0.3
## [61] cli_2.2.0 generics_0.1.0
## [63] RSQLite_2.2.1 broom_0.7.2
## [65] evaluate_0.14 fastmap_1.0.1
## [67] yaml_2.2.1 knitr_1.30
## [69] bit64_4.0.5 fs_1.5.0
## [71] mime_0.9 xml2_1.3.2
## [73] biomaRt_2.46.0 compiler_4.0.3
## [75] rstudioapi_0.13 curl_4.3
## [77] interactiveDisplayBase_1.28.0 reprex_0.3.0
## [79] geneplotter_1.68.0 stringi_1.5.3
## [81] ps_1.5.0 lattice_0.20-41
## [83] ProtGenerics_1.22.0 Matrix_1.2-18
## [85] vctrs_0.3.5 pillar_1.4.7
## [87] lifecycle_0.2.0 BiocManager_1.30.10
## [89] bitops_1.0-6 irlba_2.3.3
## [91] httpuv_1.5.4 rtracklayer_1.50.0
## [93] R6_2.5.0 promises_1.1.1
## [95] MASS_7.3-53 assertthat_0.2.1
## [97] openssl_1.4.3 withr_2.3.0
## [99] GenomicAlignments_1.26.0 Rsamtools_2.6.0
## [101] GenomeInfoDbData_1.2.4 hms_0.5.3
## [103] grid_4.0.3 coda_0.19-4
## [105] mixsqp_0.3-43 bbmle_1.0.23.1
## [107] numDeriv_2016.8-1.1 shiny_1.5.0
## [109] lubridate_1.7.9.2